################################
## Name: 08_relatio_merge.R
## Purpose: This script pulls together the relatio 
## features and creates coding files. 
## Data In: 
## 1) bioweapons articles
## data/bioweapons_casestudy_5_20_2024_public.csv
## 2) relatio features
## note this data is not shared in the replication 
## we create a derivative dfm which is shared
## /scratch/olympus/projects/russia_ukraine_war/bioweapons/concatenated_df.csv
## 3) derivative dfm
## data/dfm_relatio.rds
## Data Out:
## 1) overall relatio coding file
## data/precision_relatio_test.rds
## 2) RA files
## data/precision_relatio_test_ra1.csv
## data/precision_relatio_test_ra2.csv
## 3) relatio pairs
## data/relatio_edges.rds

## Notes:
## The first part of this analysis cannot be replicated
## because it includes document full texts. We include it here
## for reference and then create a derivative dfm, which
## we include in the replication package.

library(tidyverse)
library(quanteda)
library(igraph)


file <- read.csv("/scratch/olympus/projects/russia_ukraine_war/bioweapons/concatenated_df.csv")

## creating features 
out <- file %>%
  group_by(id) %>%
  summarize(narrative = paste(gsub("\\s",
                                   "",
                                   unique(narrative)),
                            collapse = " "))

corpus <- corpus(out, 
                 docid_field = "id",
                 text_field = "narrative")
tokens <- tokens(corpus, what = "fastestword")
dfm <- dfm(tokens)

#saveRDS(dfm, "data/dfm_relatio.rds")

################################
### Replication ###############
#################################

dfm <- readRDS("data/dfm_relatio.rds")
art <- read.csv("data/bioweapons_casestudy_5_20_2024_public.csv")

## code for cosine similarity 
cos_sim <- function(small, big) {
  small%*%t(big)/sqrt(tcrossprod(rowSums(small^2), rowSums(big^2)))
}


dim(dfm) # 113,685 unique narratives

test <- apply(dfm, 2, function(x){sum(x> 0)})
table(test) ## all documents have at least one
## feature 

m <- cos_sim(dfm, dfm)

## decompose into graph
net <- igraph::graph_from_adjacency_matrix(as.matrix(m), mode = "undirected",
                                   weighted = TRUE)


edgelist <- as_edgelist(net)

edgelist <- as.data.frame(edgelist)
edgelist$sim <- E(net)$weight

## limit to pairs that are not self-referants
colnames(edgelist)[c(1,2)] <- c("ego_id", "alter_id")
edgelist <- edgelist %>%
  filter(ego_id != alter_id)

## limit to pairs within 5 day range
edgelist$ego_date <- as.Date(art$final_date[match(edgelist$ego_id,
                                                       art$article_id)])
edgelist$alter_date <- as.Date(art$final_date[match(edgelist$alter_id,
                                                         art$article_id)])

edgelist$date_range <- ifelse(edgelist$ego_date >= edgelist$alter_date - days(5) &
                                edgelist$ego_date <= edgelist$alter_date + days(5),
                              "yes", "no")
edgelist <- edgelist %>%
  filter(date_range == "yes")


mat <- which(colnames(edgelist) == "ego_id" |
               colnames(edgelist) == "alter_id")
edgelist$match_id <- apply(edgelist, 1, function(x){
  x <- x[mat]
  x <- x[order(x)]
  x <- paste0(x, collapse = "_")
  return(x)
})
sum(duplicated(edgelist$match_id)) ## 0


## visualizing cosine similarity
ggplot(edgelist[edgelist$sim > 0, ],
       mapping = aes(x = sim)) +
  geom_density() 
summary(edgelist$sim[edgelist$sim > 0])
## cosine similarity is quite low 


## randomly sampling pairs for validation coding
## stratified by cosine similarity (minimum .2, maximum .7)
edgelist$stratum <- ifelse(edgelist$sim < .1,
                           "Less than .1",
                           ifelse(edgelist$sim  < .2,
                                  ".1 < .2",
                                  ifelse(edgelist$sim < .4,
                                         ".2 < .4",
                                         ifelse(edgelist$sim < .6,
                                                ".4 < .6",
                                                ".6+"))))
                           

table(edgelist$stratum)
## 755,836 pairs less than .1
## 13,828 pairs between .1 and .2
## 2673 pairs betwee .2 and .4 
## 765 pairs between .4 and .6 
## 531 pairs .6+

## saving these counts for precision calculation denominator
counts_edgelist <- edgelist %>%
  group_by(stratum) %>%
  summarize(n = n())


set.seed(08540)
samp_1 <- which(edgelist$stratum == ".1 < .2")
samp_1 <- sample(samp_1, 30, replace = FALSE)
samp_2 <- which(edgelist$stratum == ".2 < .4")
samp_2 <- sample(samp_2, 30, replace = FALSE)
samp_3 <- which(edgelist$stratum == ".4 < .6")
samp_3 <- sample(samp_3, 30, replace = FALSE)
samp_4 <- which(edgelist$stratum == ".6+")
samp_4 <- sample(samp_4, 30, replace = FALSE)
samp <- c(samp_1, samp_2, samp_3, samp_4)


precision_coding <- edgelist[samp, ] 

precision_coding$ego_article <- art$summary[match(precision_coding$ego_id,
                                                        art$article_id)]
precision_coding$alter_article <- art$summary[match(precision_coding$alter_id,
                                                          art$article_id)]

precision_coding$same_subject <- NA
precision_coding$same_claim <- NA

## take permutation
precision_coding <- precision_coding[sample(1:nrow(precision_coding),
                                            nrow(precision_coding),
                                            replace = FALSE), ]

## recording how many overlapping features
precision_coding$count <- apply(precision_coding, 1, function(x){
  mat <- dfm[rownames(dfm) %in% x[1:2], ]
  mat <- apply(mat, 2, function(x){
    sum(x > 0)
  })
  ret <- sum(mat > 1)
  return(ret)
})
tapply(precision_coding$count, precision_coding$stratum, function(x){summary(x)})


## merge in denominator population data
precision_coding$stratum_denominator <- counts_edgelist$n[
  match(precision_coding$stratum,
        counts_edgelist$stratum)
]

## master_file
#saveRDS(precision_coding, "data/precision_relatio_test.rds")

## RA file
## each file should be coded by three RAs
precision_coding$coder <- c(rep("RA1", 60),
                            rep("RA2", 60))

#write.csv(precision_coding[precision_coding$coder == "RA1", c("ego_id",
#                             "alter_id",
#                             "ego_article",
#                          "alter_article",
#                            "same_subject",
#                           "same_claim")], 
#          "data/precision_relatio_test_ra1.csv")


#write.csv(precision_coding[precision_coding$coder == "RA2", c("ego_id",
#                                                             "alter_id",
#                                                             "ego_article",
#                                                              "alter_article",
#                                                              "same_subject",
#                                                              "same_claim")],
#          "data/precision_relatio_test_ra2.csv")


#########################################
### Edgelist Pairs ##################


edgelist <- edgelist %>%
  filter(sim > .1)



#saveRDS(edgelist, "data/relatio_edges.rds")




